# Environment setup ----
# Working directories are machine-specific; the commented-out paths are the
# alternative locations that were already present in this script.
#setwd("/home/gbakie/neu/stat-sp16/project/Online_News_Popularity")
setwd("/Users/Darshan/Documents/Online_News_Popularity")

# A previous run reported that attaching caret also loads lattice and ggplot2.
library(caret)

# Sourced relative to the project directory set above; presumably defines the
# preprocessing helpers called below. A previous run reported that sourcing it
# attaches dplyr (masking stats::filter/lag and base::intersect/setdiff/
# setequal/union) and warned that package 'car' was built under R 3.2.4.
source("DataPreprocess.R")

# Fix the RNG state before any data handling.
set.seed(464)

# Data loading ----
setwd("/Users/Darshan/Documents/CS 7280 Stats/Project/Data/")
#setwd("/home/gbakie/neu/stat-sp16/project/data")

# Read the training set and run both cleaning passes in one expression.
news <- correlation_cleaning(
  data_cleaning(
    read.csv("Train.csv", header = TRUE)
  )
)

# Target transformation ----
# Original author's note: the transformation cannot be applied on weighted
# regression.
return_obj <- target_transformation(news)
news <- return_obj$news
# The global is spelled `lamda` (sic); the name is kept as-is because other
# code may refer to it.
lamda <- return_obj$lambda

# Normalisation and categorical encoding ----
obj <- normalization(news)
news <- cat_encoding(obj$news)

# Keep the URL column aside and drop it from the modelling data.
url <- news$url
news$url <- NULL
# Indicator (dummy) columns for data channel and weekday. The names are
# assembled from their common prefixes and come out in the same order as the
# original hand-written list.
categorical_var <- c(
  paste0(
    "data_channel_is_",
    c("lifestyle", "entertainment", "bus", "world", "socmed", "tech")
  ),
  paste0(
    "weekday_is_",
    c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
      "sunday")
  )
)

# Keep the indicator columns in their own data frame and remove them from
# `news`.
news_with_cat <- news[, categorical_var, drop = FALSE]
news <- news[, setdiff(names(news), categorical_var), drop = FALSE]
#news <- cook_outliers_removal(news)

# Columns that must not be treated as candidate predictors when listing the
# remaining columns: identifiers, the indicator columns above, the response
# and the two categorical columns.
ignored_column_names <- c(
  "url", "timedelta",
  categorical_var,
  "is_weekend", "shares", "data_channel", "cat_dow"
)
column_names <- names(news)
needed_columns <- setdiff(column_names, ignored_column_names)
# Fit an unweighted linear model of `shares` on a fixed set of predictors.
# Term order is preserved exactly, because it determines the order of the
# coefficients and of any sequential sums of squares.
model <- lm(
  shares ~
    data_channel +
    cat_dow +
    i_kw_max_avg_avg +
    self_reference_avg_sharess +
    i_kw_avg_max_max +
    num_hrefs +
    global_subjectivity +
    LDA_00 +
    LDA_01 +
    LDA_02 +
    num_self_hrefs +
    i_n_unique_tokens_content +
    i_title_subjectivity_sentiment_polarity +
    abs_title_subjectivity +
    n_tokens_title +
    min_positive_polarity +
    num_imgs +
    average_token_length +
    title_sentiment_polarity +
    i_min_avg_negative_pol,
  data = news
)
# Residual diagnostics: for every column in `needed_columns`, print the column
# name and draw the model residuals against that column with a smoother on top.
#
# The residuals are stored in a plotting data frame and matched to the rows of
# `news` by row name. Rows that lm() left out of the fit (its default
# na.action omits incomplete rows) therefore get NA and are skipped by ggplot2
# with a warning, instead of triggering an aesthetic/data length mismatch as
# happens when `model$residuals` is embedded directly in the mapping.
residual_data <- news[, needed_columns, drop = FALSE]
residual_data$.residual <- unname(residuals(model)[rownames(news)])

for (column in needed_columns) {
  print(column)
  # `.data[[column]]` selects the column by its string name; it replaces the
  # deprecated aes_string() and also copes with non-syntactic column names.
  p <- ggplot(
    residual_data,
    aes(x = .data[[column]], y = .data[[".residual"]])
  ) +
    geom_point() +
    stat_smooth() +
    # Explicit labels, since the mapping expressions are not readable titles.
    labs(x = column, y = "Residual")
  print(p)
}
## [1] "n_tokens_title"

## [1] "num_hrefs"

## [1] "num_self_hrefs"

## [1] "num_imgs"

## [1] "num_videos"

## [1] "average_token_length"

## [1] "num_keywords"

## [1] "kw_min_avg"

## [1] "self_reference_avg_sharess"

## [1] "LDA_00"

## [1] "LDA_01"

## [1] "LDA_02"

## [1] "LDA_03"

## [1] "LDA_04"

## [1] "global_subjectivity"

## [1] "global_rate_positive_words"

## [1] "global_rate_negative_words"

## [1] "avg_positive_polarity"

## [1] "min_positive_polarity"

## [1] "max_positive_polarity"

## [1] "max_negative_polarity"

## [1] "title_sentiment_polarity"

## [1] "abs_title_subjectivity"

## [1] "i_n_unique_tokens_content"

## [1] "i_title_subjectivity_sentiment_polarity"

## [1] "i_min_avg_negative_pol"

## [1] "i_rate_pos_glob_sent_polarity"

## [1] "i_kw_max_avg_min"

## [1] "i_kw_max_avg_avg"

## [1] "i_kw_avg_max_max"
